#include <fstream>
#include <string>
#include <iostream>
#include <sstream>

#include <vector>
#include <map>
#include <set>
#include <queue>
#include <algorithm>
#include <limits.h>

using namespace std;

#include "Permutation.h"
Permutation PT;		// Permutation function object

/// global parameters
int K;				// length of shingle

int numShingles;	// number of shingles

int sigLength;		// length of signature

double threshold;	// similarity threshold

int _R;				// size of band


/// global variables
int numDoc;
vector<string> filenames;				// data filenames

// Original similarity
vector< set<int> > dictionaryMatrix;	// sparse matrix in rows
vector< set<int> > docDictionary;		// sparse matrix in columns

// Shingling similarity
vector< set<int> > shingleMatrix;		// sparse matrix in rows
vector< set<int> > docShingle;			// sparse matrix in columns

// Signature similarity
vector< vector<int> > signatureMatrix;	// dense signature matrix generated by minhashing

// Utility macros
#define contains(container, item) (container.find(item) != container.end())
#define message(m) cout << m << endl

// Join a queue into a single string
string constructShingle(queue<string> shingle)
{
	string result;
	int n = shingle.size();
	for (int i=0; i<n-1; i++)
	{
		result += shingle.front() + " ";
		shingle.pop();
	}

	result += shingle.front();

	return result;
}

/// Construct the original and shingle matrices
void constructShingleMatrix(int K, vector< set<int> >& shingleMatrixInRows, vector< set<int> >& shingleMatrixInColumns)
{
	// The map: shingle => document ids
	map< string, set<int> > shingleDocMap;

	/// Read the shingles from file
	char shingleFile[512];
	sprintf(shingleFile, "Data/shingle/Shingle_K=%d.txt", K);
	ifstream inF(shingleFile, ios::in);

	inF >> numShingles;

	while(!inF.eof())
	{
		// Get the shingle
		string shingle, nextWord;
		inF >> shingle;
		for (int j = 1; j < K; j++)	{
			inF >> nextWord;
			shingle += " " + nextWord;
		}
		
		// Special case, end of document
		if(inF.eof()) continue;

		// Create row for the shingle
		shingleDocMap[shingle] = set<int>();
	}


	/// Read the documents one by one
	for(int i=0; i<numDoc; i++)
	{
		string filename = filenames[i];

		ifstream inF(filename.c_str(), ios::in);
		if(!inF) {
			cout << filename << " doesn't exist [SKIPPING].\n";
			continue;
		}

		// start reading a new article
		queue<string> shingleQ;

		// Read the first K words
		string word;
		for (int j = 0; j < K; j++)	{
			inF >> word;
			shingleQ.push(word);
		}

		do {
			// insert shingle entry
			string shingle = constructShingle(shingleQ);

			// Check if this is a valid shingle
			if(contains(shingleDocMap, shingle))
				shingleDocMap[shingle].insert(i);

			// read the next word
			shingleQ.pop();
			inF >> word;
			shingleQ.push(word);
		}
		while (!inF.eof());

		inF.close();
	}

	/// Convert map to sparse matrix in rows
	for ( map< string, set<int> >::iterator itr = shingleDocMap.begin(); itr != shingleDocMap.end(); itr++)
		shingleMatrixInRows.push_back(itr->second);

	/// Construct the sparse matrix in columns
	shingleMatrixInColumns.resize(numDoc);
	for (int c = 0; c < numDoc; c++){
		for (int r = 0; r < numShingles; r++){
			if (contains( shingleMatrixInRows[r], c))
				shingleMatrixInColumns[c].insert(r);
		}
	}
}

/// Construct the signature matrix
void minhashing()
{
	// Initialize the signature matrix
	vector<int> initRow(numDoc, INT_MAX);
	signatureMatrix.resize(sigLength, initRow);

	// Simulate permutation
	vector< vector<int> > &M = signatureMatrix;
	for (int r = 0; r < numShingles; r++){
		for (int c = 0; c < numDoc; c++){
			if ( contains(shingleMatrix[r], c) ){
				for (int i = 0; i < sigLength; i++)
				{
					int h = PT.hash(i, r);
					if (h < M[i][c])
						M[i][c] = h;
				}
			}
		}
	}
}

/// Jaccard similarity
double JS(void * data, bool sparse, int c1, int c2)
{
	int numUnion = 0;
	int numIntersection = 0;

	if(sparse)
	{
		// M in columns
		vector< set<int> > & M = *((vector< set<int> > *) data);

		set<int> set1 = M[c1];
		set<int> set2 = M[c2];

		vector<int> unionVec(set1.size() + set2.size());
		vector<int>::iterator lastUnionItem = set_union(set1.begin(), set1.end(), set2.begin(), set2.end(), unionVec.begin());
		numUnion = lastUnionItem - unionVec.begin();

		vector<int> intersectionVec( max(set1.size(), set2.size()));
		vector<int>::iterator lastIntersectionItem = set_intersection(set1.begin(), set1.end(), set2.begin(), set2.end(), intersectionVec.begin());
		numIntersection = lastIntersectionItem - intersectionVec.begin();
	}
	else
	{
		// M in rows
		vector< vector<int> > & M = *((vector< vector<int> > *) data);

		numUnion = M.size();
		for(int r = 0; r < (int) M.size(); r++)
			if(M[r][c1] == M[r][c2])	
				numIntersection++;
	}

	return double(numIntersection) / numUnion;
}

/// The ground truth
void computeGroundTruth(void * data, bool sparse, multimap< double, pair<int, int> > & similarity,  set< pair<int,int> > & similarPairs)
{
	for(int i = 0; i < numDoc; i++)	
		for(int j = i+1; j< numDoc; j++)
		{
			pair<int, int> p = make_pair(i, j);
			double s = JS(data, sparse, i, j);

			similarity.insert( make_pair(s,p) );

			if(s >= threshold)
				similarPairs.insert(p);
		}
}

/// Locality Sensitive Hashing (LSH)
void LSH(set< pair<int,int> > & candidatePairs)
{
	// In case
	candidatePairs.clear();

	vector< vector <int> > &M = signatureMatrix;
	// Calculate B
	int B = M.size() / _R;
	int R = _R;
	if(B == 0){
		B = 1;
		R = M.size();
	}

	// For each band
	for (int b = 0; b < B; b++ )
	{
		// The buckets of document ids
		map< vector<int>, set<int> > buckets;

		// For each column
		for (int c = 0; c < numDoc; c++)
		{
			// Get the band
			vector<int> band;			
			for(int r = b * R; r < (b+1)*R; r++)
				band.push_back(M[r][c]);

			// Hash band to bucket
			buckets[band].insert(c);
		}

		// Save the candidate pairs
		for(map< vector<int>, set<int> >::iterator itr = buckets.begin(); itr != buckets.end() ; itr++)
		{
			// If the set contains one single id
			if(itr->second.size() < 2) continue;

			// Convert set to vector
			vector<int> similarColumns;
			for(set<int>::iterator it = itr->second.begin(); it != itr->second.end(); it++)
				similarColumns.push_back(*it);

			// Make the pairs
			for(int i = 0; i < (int) similarColumns.size(); i++)
				for(int j = i+1; j < (int) similarColumns.size(); j++)
					candidatePairs.insert( make_pair(similarColumns[i], similarColumns[j]) );
		}
	}

}

/// Compute FP & FN
int inFirstNotSecond(set< pair<int, int> > &first, set< pair<int, int> > &second)
{
	int count = 0;
	for(set< pair<int, int> >::iterator itr = first.begin(); itr != first.end(); itr++)
		if(!contains(second, *itr)) count++;
	return count;
}

int main(int argc, char ** argv)
{
	/// Command line check
	int N = argc;
	if (N < 4)	{
		cout << "Invalid parameters!\n\n";
		cout << "Usage: " << argv[0] << " (K) (threshold) (R)\n\n";
		cout << "e.g.   " << argv[0] << " 2 0.2 5\n\n";
		return -1;
	}

	/// 1) Construct the shingle matrix
	{
		// Length of the shingle
		K = atoi(argv[1]);
		
		// The document folder and number of documents
		string doc_folder = "Data/data/";
		numDoc = 11;
		
		// Prepare for document filenames: assume they are named [i].txt starting from [1]
		char buffer[512];
		for(int i = 0; i < numDoc; i++)
		{		
			sprintf(buffer, "%s/D%d.txt", doc_folder.c_str(), (i+1));
			filenames.push_back(buffer);
		}

		// Read documents and fill in shingle matrix

		message("Constructing the original matrix. ");
		constructShingleMatrix(1, dictionaryMatrix, docDictionary);		

		message("Constructing the shingle matrix. ");
		constructShingleMatrix(K, shingleMatrix, docShingle);
	}


	/// 2) Minhashing
	{
		// a) Permutation file
		char per_filename[512];
		sprintf(per_filename, "Data/permutation/Permutation_K=%d.txt", K);
		PT.load(per_filename, numShingles, sigLength);

		message("Minhashing..");

		// b) Construct the signature matrix
		minhashing();
	}

	/// 3) Ground truth by Jaccard similarity
	set< pair<int, int> > similarPairs_original;
	set< pair<int, int> > similarPairs_shingle;
	set< pair<int, int> > similarPairs_signature;

	multimap< double, pair<int, int> > JS_original;
	multimap< double, pair<int, int> > JS_shingle;
	multimap< double, pair<int, int> > JS_signature;
	{
		threshold = atof(argv[2]);

		message("Computing ground truth..");

		computeGroundTruth((void*)&docDictionary, true, JS_original, similarPairs_original);
		computeGroundTruth((void*)&docShingle, true, JS_shingle, similarPairs_shingle);
		computeGroundTruth((void*)&signatureMatrix, false, JS_signature, similarPairs_signature);

		// Output all the similarities
		if (0)
		{
			multimap< double, pair<int, int> >::iterator itr;
			ofstream outSimF("sim_shingle_K=.txt", ios::out);
			for (itr = JS_shingle.begin(); itr != JS_shingle.end(); itr++)
				outSimF << itr->first << endl;
			outSimF.close();

			outSimF.open("sim_signature_K=.txt", ios::out);
			for (itr = JS_signature.begin(); itr != JS_signature.end(); itr++)
				outSimF << itr->first << endl;
			outSimF.close();
		}
	}
	
	/// 4) LSH
	set< pair<int, int> > candidatePairs;
	{
		_R = atoi(argv[3]);

		message("LSH for candidate pairs..");

		LSH(candidatePairs);
	}

	/// 5) Results
	{
		int FP_original = inFirstNotSecond(candidatePairs, similarPairs_original);
		int FN_original = inFirstNotSecond(similarPairs_original, candidatePairs);

		int FP_shingle = inFirstNotSecond(candidatePairs, similarPairs_shingle);
		int FN_shingle = inFirstNotSecond(similarPairs_shingle, candidatePairs);
		
		int FP_signature = inFirstNotSecond(candidatePairs, similarPairs_signature);
		int FN_signature = inFirstNotSecond(similarPairs_signature, candidatePairs);
	
		cout << "\n\n==\nResults (" << numDoc << " documents) \n\n\t K = " << K << "\n\t T = " << threshold << "\n\t R = " << _R << ", B = " << 100/_R << " \n" << endl;
		cout << "\tOriginal\t|\tShingling\t|\tSignature" << endl;
		cout << "\tFP\tFN\t|\tFP\tFN\t|\tFP\tFN" << endl;
		cout << "\t" << FP_original << "\t" << FN_original << "\t|\t" << 
						FP_shingle << "\t" << FN_shingle << "\t|\t" << 
						FP_signature << "\t" << FN_signature << endl;
	

		/// 5.1) Save results to file
		if(0)
		{
			char out_filename[512];
			sprintf(out_filename, "result_k=%d_r=%d.txt", K, _R);
			ofstream outF( out_filename );

			outF << _R << "\t" << FP_original << "\t" << FN_original << "\t" << FP_shingle << "\t" << FN_shingle << "\t" << FP_signature << "\t" << FN_signature;

			outF.close();
		}
	}

	printf("\nThank you!\n");
}
